#include <algorithm>
#include <cctype>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

#include "tokenize_util.h"

// Returns true when `ch` is one of the ASCII decimal digits '0'..'9'.
// No other numeric code points are recognised.
bool is_number(char32_t ch) {
    if (ch < U'0') {
        return false;
    }
    return ch <= U'9';
}

// Returns true when `ch` falls inside one of the inclusive code point ranges
// listed below. token_split uses this predicate wherever its pattern says
// `\p{L}`.
//
// NOTE(review): the table looks like it was generated from the Unicode
// letter categories, with some neighbouring ranges merged — confirm against
// the generator before editing by hand.
//
// The lookup is a binary search, so the table MUST stay sorted by `start`
// and the ranges must not overlap.
bool is_letter(char32_t ch) {
    static const struct { char32_t start, end; } ranges[] = {
        {0x41, 0x5A},
        {0x61, 0x7A},
        {0xAA, 0xAA},
        {0xB5, 0xB5},
        {0xBA, 0xBA},
        {0xC0, 0xD6},
        {0xD8, 0xF6},
        {0xF8, 0x2C1},
        {0x2C6, 0x2D1},
        {0x2E0, 0x2E4},
        {0x2EC, 0x2EC},
        {0x2EE, 0x2EE},
        {0x370, 0x374},
        {0x376, 0x377},
        {0x37A, 0x37D},
        {0x37F, 0x37F},
        {0x386, 0x386},
        {0x388, 0x38A},
        {0x38C, 0x38C},
        {0x38E, 0x3A1},
        {0x3A3, 0x3F5},
        {0x3F7, 0x481},
        {0x48A, 0x52F},
        {0x531, 0x556},
        {0x559, 0x559},
        {0x560, 0x588},
        {0x5D0, 0x5EA},
        {0x5EF, 0x5F2},
        {0x620, 0x64A},
        {0x66E, 0x66F},
        {0x671, 0x6D3},
        {0x6D5, 0x6D5},
        {0x6E5, 0x6E6},
        {0x6EE, 0x6EF},
        {0x6FA, 0x6FC},
        {0x6FF, 0x6FF},
        {0x710, 0x710},
        {0x712, 0x72F},
        {0x74D, 0x7A5},
        {0x7B1, 0x7B1},
        {0x7CA, 0x7EA},
        {0x7F4, 0x7F5},
        {0x7FA, 0x7FA},
        {0x800, 0x815},
        {0x81A, 0x81A},
        {0x824, 0x824},
        {0x828, 0x828},
        {0x840, 0x858},
        {0x860, 0x86A},
        {0x870, 0x887},
        {0x889, 0x88F},
        {0x8A0, 0x8C9},
        {0x904, 0x939},
        {0x93D, 0x93D},
        {0x950, 0x950},
        {0x958, 0x961},
        {0x971, 0x980},
        {0x985, 0x98C},
        {0x98F, 0x990},
        {0x993, 0x9A8},
        {0x9AA, 0x9B0},
        {0x9B2, 0x9B2},
        {0x9B6, 0x9B9},
        {0x9BD, 0x9BD},
        {0x9CE, 0x9CE},
        {0x9DC, 0x9DD},
        {0x9DF, 0x9E1},
        {0x9F0, 0x9F1},
        {0x9FC, 0x9FC},
        {0xA05, 0xA0A},
        {0xA0F, 0xA10},
        {0xA13, 0xA28},
        {0xA2A, 0xA30},
        {0xA32, 0xA33},
        {0xA35, 0xA36},
        {0xA38, 0xA39},
        {0xA59, 0xA5C},
        {0xA5E, 0xA5E},
        {0xA72, 0xA74},
        {0xA85, 0xA8D},
        {0xA8F, 0xA91},
        {0xA93, 0xAA8},
        {0xAAA, 0xAB0},
        {0xAB2, 0xAB3},
        {0xAB5, 0xAB9},
        {0xABD, 0xABD},
        {0xAD0, 0xAD0},
        {0xAE0, 0xAE1},
        {0xAF9, 0xAF9},
        {0xB05, 0xB0C},
        {0xB0F, 0xB10},
        {0xB13, 0xB28},
        {0xB2A, 0xB30},
        {0xB32, 0xB33},
        {0xB35, 0xB39},
        {0xB3D, 0xB3D},
        {0xB5C, 0xB5D},
        {0xB5F, 0xB61},
        {0xB71, 0xB71},
        {0xB83, 0xB83},
        {0xB85, 0xB8A},
        {0xB8E, 0xB90},
        {0xB92, 0xB95},
        {0xB99, 0xB9A},
        {0xB9C, 0xB9C},
        {0xB9E, 0xB9F},
        {0xBA3, 0xBA4},
        {0xBA8, 0xBAA},
        {0xBAE, 0xBB9},
        {0xBD0, 0xBD0},
        {0xC05, 0xC0C},
        {0xC0E, 0xC10},
        {0xC12, 0xC28},
        {0xC2A, 0xC39},
        {0xC3D, 0xC3D},
        {0xC58, 0xC5A},
        {0xC5C, 0xC5D},
        {0xC60, 0xC61},
        {0xC80, 0xC80},
        {0xC85, 0xC8C},
        {0xC8E, 0xC90},
        {0xC92, 0xCA8},
        {0xCAA, 0xCB3},
        {0xCB5, 0xCB9},
        {0xCBD, 0xCBD},
        {0xCDC, 0xCDE},
        {0xCE0, 0xCE1},
        {0xCF1, 0xCF2},
        {0xD04, 0xD0C},
        {0xD0E, 0xD10},
        {0xD12, 0xD3A},
        {0xD3D, 0xD3D},
        {0xD4E, 0xD4E},
        {0xD54, 0xD56},
        {0xD5F, 0xD61},
        {0xD7A, 0xD7F},
        {0xD85, 0xD96},
        {0xD9A, 0xDB1},
        {0xDB3, 0xDBB},
        {0xDBD, 0xDBD},
        {0xDC0, 0xDC6},
        {0xE01, 0xE30},
        {0xE32, 0xE33},
        {0xE40, 0xE46},
        {0xE81, 0xE82},
        {0xE84, 0xE84},
        {0xE86, 0xE8A},
        {0xE8C, 0xEA3},
        {0xEA5, 0xEA5},
        {0xEA7, 0xEB0},
        {0xEB2, 0xEB3},
        {0xEBD, 0xEBD},
        {0xEC0, 0xEC4},
        {0xEC6, 0xEC6},
        {0xEDC, 0xEDF},
        {0xF00, 0xF00},
        {0xF40, 0xF47},
        {0xF49, 0xF6C},
        {0xF88, 0xF8C},
        {0x1000, 0x102A},
        {0x103F, 0x103F},
        {0x1050, 0x1055},
        {0x105A, 0x105D},
        {0x1061, 0x1061},
        {0x1065, 0x1066},
        {0x106E, 0x1070},
        {0x1075, 0x1081},
        {0x108E, 0x108E},
        {0x10A0, 0x10C5},
        {0x10C7, 0x10C7},
        {0x10CD, 0x10CD},
        {0x10D0, 0x10FA},
        {0x10FC, 0x1248},
        {0x124A, 0x124D},
        {0x1250, 0x1256},
        {0x1258, 0x1258},
        {0x125A, 0x125D},
        {0x1260, 0x1288},
        {0x128A, 0x128D},
        {0x1290, 0x12B0},
        {0x12B2, 0x12B5},
        {0x12B8, 0x12BE},
        {0x12C0, 0x12C0},
        {0x12C2, 0x12C5},
        {0x12C8, 0x12D6},
        {0x12D8, 0x1310},
        {0x1312, 0x1315},
        {0x1318, 0x135A},
        {0x1380, 0x138F},
        {0x13A0, 0x13F5},
        {0x13F8, 0x13FD},
        {0x1401, 0x166C},
        {0x166F, 0x167F},
        {0x1681, 0x169A},
        {0x16A0, 0x16EA},
        {0x16F1, 0x16F8},
        {0x1700, 0x1711},
        {0x171F, 0x1731},
        {0x1740, 0x1751},
        {0x1760, 0x176C},
        {0x176E, 0x1770},
        {0x1780, 0x17B3},
        {0x17D7, 0x17D7},
        {0x17DC, 0x17DC},
        {0x1820, 0x1878},
        {0x1880, 0x1884},
        {0x1887, 0x18A8},
        {0x18AA, 0x18AA},
        {0x18B0, 0x18F5},
        {0x1900, 0x191E},
        {0x1950, 0x196D},
        {0x1970, 0x1974},
        {0x1980, 0x19AB},
        {0x19B0, 0x19C9},
        {0x1A00, 0x1A16},
        {0x1A20, 0x1A54},
        {0x1AA7, 0x1AA7},
        {0x1B05, 0x1B33},
        {0x1B45, 0x1B4C},
        {0x1B83, 0x1BA0},
        {0x1BAE, 0x1BAF},
        {0x1BBA, 0x1BE5},
        {0x1C00, 0x1C23},
        {0x1C4D, 0x1C4F},
        {0x1C5A, 0x1C7D},
        {0x1C80, 0x1C8A},
        {0x1C90, 0x1CBA},
        {0x1CBD, 0x1CBF},
        {0x1CE9, 0x1CEC},
        {0x1CEE, 0x1CF3},
        {0x1CF5, 0x1CF6},
        {0x1CFA, 0x1CFA},
        {0x1D00, 0x1DBF},
        {0x1E00, 0x1F15},
        {0x1F18, 0x1F1D},
        {0x1F20, 0x1F45},
        {0x1F48, 0x1F4D},
        {0x1F50, 0x1F57},
        {0x1F59, 0x1F59},
        {0x1F5B, 0x1F5B},
        {0x1F5D, 0x1F5D},
        {0x1F5F, 0x1F7D},
        {0x1F80, 0x1FB4},
        {0x1FB6, 0x1FBC},
        {0x1FBE, 0x1FBE},
        {0x1FC2, 0x1FC4},
        {0x1FC6, 0x1FCC},
        {0x1FD0, 0x1FD3},
        {0x1FD6, 0x1FDB},
        {0x1FE0, 0x1FEC},
        {0x1FF2, 0x1FF4},
        {0x1FF6, 0x1FFC},
        {0x2071, 0x2071},
        {0x207F, 0x207F},
        {0x2090, 0x209C},
        {0x2102, 0x2102},
        {0x2107, 0x2107},
        {0x210A, 0x2113},
        {0x2115, 0x2115},
        {0x2119, 0x211D},
        {0x2124, 0x2124},
        {0x2126, 0x2126},
        {0x2128, 0x2128},
        {0x212A, 0x212D},
        {0x212F, 0x2139},
        {0x213C, 0x213F},
        {0x2145, 0x2149},
        {0x214E, 0x214E},
        {0x2183, 0x2184},
        {0x2C00, 0x2CE4},
        {0x2CEB, 0x2CEE},
        {0x2CF2, 0x2CF3},
        {0x2D00, 0x2D25},
        {0x2D27, 0x2D27},
        {0x2D2D, 0x2D2D},
        {0x2D30, 0x2D67},
        {0x2D6F, 0x2D6F},
        {0x2D80, 0x2D96},
        {0x2DA0, 0x2DA6},
        {0x2DA8, 0x2DAE},
        {0x2DB0, 0x2DB6},
        {0x2DB8, 0x2DBE},
        {0x2DC0, 0x2DC6},
        {0x2DC8, 0x2DCE},
        {0x2DD0, 0x2DD6},
        {0x2DD8, 0x2DDE},
        {0x2E2F, 0x2E2F},
        {0x3005, 0x3006},
        {0x3031, 0x3035},
        {0x303B, 0x303C},
        {0x3041, 0x3096},
        {0x309D, 0x309F},
        {0x30A1, 0x30FA},
        {0x30FC, 0x30FF},
        {0x3105, 0x312F},
        {0x3131, 0x318E},
        {0x31A0, 0x31BF},
        {0x31F0, 0x31FF},
        {0x3400, 0x4DBF},
        {0x4E00, 0xA48C},
        {0xA4D0, 0xA4FD},
        {0xA500, 0xA60C},
        {0xA610, 0xA61F},
        {0xA62A, 0xA62B},
        {0xA640, 0xA66E},
        {0xA67F, 0xA69D},
        {0xA6A0, 0xA6E5},
        {0xA717, 0xA71F},
        {0xA722, 0xA788},
        {0xA78B, 0xA7DC},
        {0xA7F1, 0xA801},
        {0xA803, 0xA805},
        {0xA807, 0xA80A},
        {0xA80C, 0xA822},
        {0xA840, 0xA873},
        {0xA882, 0xA8B3},
        {0xA8F2, 0xA8F7},
        {0xA8FB, 0xA8FB},
        {0xA8FD, 0xA8FE},
        {0xA90A, 0xA925},
        {0xA930, 0xA946},
        {0xA960, 0xA97C},
        {0xA984, 0xA9B2},
        {0xA9CF, 0xA9CF},
        {0xA9E0, 0xA9E4},
        {0xA9E6, 0xA9EF},
        {0xA9FA, 0xA9FE},
        {0xAA00, 0xAA28},
        {0xAA40, 0xAA42},
        {0xAA44, 0xAA4B},
        {0xAA60, 0xAA76},
        {0xAA7A, 0xAA7A},
        {0xAA7E, 0xAAAF},
        {0xAAB1, 0xAAB1},
        {0xAAB5, 0xAAB6},
        {0xAAB9, 0xAABD},
        {0xAAC0, 0xAAC0},
        {0xAAC2, 0xAAC2},
        {0xAADB, 0xAADD},
        {0xAAE0, 0xAAEA},
        {0xAAF2, 0xAAF4},
        {0xAB01, 0xAB06},
        {0xAB09, 0xAB0E},
        {0xAB11, 0xAB16},
        {0xAB20, 0xAB26},
        {0xAB28, 0xAB2E},
        {0xAB30, 0xAB5A},
        {0xAB5C, 0xAB69},
        {0xAB70, 0xABE2},
        {0xAC00, 0xD7A3},
        {0xD7B0, 0xD7C6},
        {0xD7CB, 0xD7FB},
        {0xF900, 0xFA6D},
        {0xFA70, 0xFAD9},
        {0xFB00, 0xFB06},
        {0xFB13, 0xFB17},
        {0xFB1D, 0xFB1D},
        {0xFB1F, 0xFB28},
        {0xFB2A, 0xFB36},
        {0xFB38, 0xFB3C},
        {0xFB3E, 0xFB3E},
        {0xFB40, 0xFB41},
        {0xFB43, 0xFB44},
        {0xFB46, 0xFBB1},
        {0xFBD3, 0xFD3D},
        {0xFD50, 0xFD8F},
        {0xFD92, 0xFDC7},
        {0xFDF0, 0xFDFB},
        {0xFE70, 0xFE74},
        {0xFE76, 0xFEFC},
        {0xFF21, 0xFF3A},
        {0xFF41, 0xFF5A},
        {0xFF66, 0xFFBE},
        {0xFFC2, 0xFFC7},
        {0xFFCA, 0xFFCF},
        {0xFFD2, 0xFFD7},
        {0xFFDA, 0xFFDC},
        {0x10000, 0x1000B},
        {0x1000D, 0x10026},
        {0x10028, 0x1003A},
        {0x1003C, 0x1003D},
        {0x1003F, 0x1004D},
        {0x10050, 0x1005D},
        {0x10080, 0x100FA},
        {0x10280, 0x1029C},
        {0x102A0, 0x102D0},
        {0x10300, 0x1031F},
        {0x1032D, 0x10340},
        {0x10342, 0x10349},
        {0x10350, 0x10375},
        {0x10380, 0x1039D},
        {0x103A0, 0x103C3},
        {0x103C8, 0x103CF},
        {0x10400, 0x1049D},
        {0x104B0, 0x104D3},
        {0x104D8, 0x104FB},
        {0x10500, 0x10527},
        {0x10530, 0x10563},
        {0x10570, 0x1057A},
        {0x1057C, 0x1058A},
        {0x1058C, 0x10592},
        {0x10594, 0x10595},
        {0x10597, 0x105A1},
        {0x105A3, 0x105B1},
        {0x105B3, 0x105B9},
        {0x105BB, 0x105BC},
        {0x105C0, 0x105F3},
        {0x10600, 0x10736},
        {0x10740, 0x10755},
        {0x10760, 0x10767},
        {0x10780, 0x10785},
        {0x10787, 0x107B0},
        {0x107B2, 0x107BA},
        {0x10800, 0x10805},
        {0x10808, 0x10808},
        {0x1080A, 0x10835},
        {0x10837, 0x10838},
        {0x1083C, 0x1083C},
        {0x1083F, 0x10855},
        {0x10860, 0x10876},
        {0x10880, 0x1089E},
        {0x108E0, 0x108F2},
        {0x108F4, 0x108F5},
        {0x10900, 0x10915},
        {0x10920, 0x10939},
        {0x10940, 0x10959},
        {0x10980, 0x109B7},
        {0x109BE, 0x109BF},
        {0x10A00, 0x10A00},
        {0x10A10, 0x10A13},
        {0x10A15, 0x10A17},
        {0x10A19, 0x10A35},
        {0x10A60, 0x10A7C},
        {0x10A80, 0x10A9C},
        {0x10AC0, 0x10AC7},
        {0x10AC9, 0x10AE4},
        {0x10B00, 0x10B35},
        {0x10B40, 0x10B55},
        {0x10B60, 0x10B72},
        {0x10B80, 0x10B91},
        {0x10C00, 0x10C48},
        {0x10C80, 0x10CB2},
        {0x10CC0, 0x10CF2},
        {0x10D00, 0x10D23},
        {0x10D4A, 0x10D65},
        {0x10D6F, 0x10D85},
        {0x10E80, 0x10EA9},
        {0x10EB0, 0x10EB1},
        {0x10EC2, 0x10EC7},
        {0x10F00, 0x10F1C},
        {0x10F27, 0x10F27},
        {0x10F30, 0x10F45},
        {0x10F70, 0x10F81},
        {0x10FB0, 0x10FC4},
        {0x10FE0, 0x10FF6},
        {0x11003, 0x11037},
        {0x11071, 0x11072},
        {0x11075, 0x11075},
        {0x11083, 0x110AF},
        {0x110D0, 0x110E8},
        {0x11103, 0x11126},
        {0x11144, 0x11144},
        {0x11147, 0x11147},
        {0x11150, 0x11172},
        {0x11176, 0x11176},
        {0x11183, 0x111B2},
        {0x111C1, 0x111C4},
        {0x111DA, 0x111DA},
        {0x111DC, 0x111DC},
        {0x11200, 0x11211},
        {0x11213, 0x1122B},
        {0x1123F, 0x11240},
        {0x11280, 0x11286},
        {0x11288, 0x11288},
        {0x1128A, 0x1128D},
        {0x1128F, 0x1129D},
        {0x1129F, 0x112A8},
        {0x112B0, 0x112DE},
        {0x11305, 0x1130C},
        {0x1130F, 0x11310},
        {0x11313, 0x11328},
        {0x1132A, 0x11330},
        {0x11332, 0x11333},
        {0x11335, 0x11339},
        {0x1133D, 0x1133D},
        {0x11350, 0x11350},
        {0x1135D, 0x11361},
        {0x11380, 0x11389},
        {0x1138B, 0x1138B},
        {0x1138E, 0x1138E},
        {0x11390, 0x113B5},
        {0x113B7, 0x113B7},
        {0x113D1, 0x113D1},
        {0x113D3, 0x113D3},
        {0x11400, 0x11434},
        {0x11447, 0x1144A},
        {0x1145F, 0x11461},
        {0x11480, 0x114AF},
        {0x114C4, 0x114C5},
        {0x114C7, 0x114C7},
        {0x11580, 0x115AE},
        {0x115D8, 0x115DB},
        {0x11600, 0x1162F},
        {0x11644, 0x11644},
        {0x11680, 0x116AA},
        {0x116B8, 0x116B8},
        {0x11700, 0x1171A},
        {0x11740, 0x11746},
        {0x11800, 0x1182B},
        {0x118A0, 0x118DF},
        {0x118FF, 0x11906},
        {0x11909, 0x11909},
        {0x1190C, 0x11913},
        {0x11915, 0x11916},
        {0x11918, 0x1192F},
        {0x1193F, 0x1193F},
        {0x11941, 0x11941},
        {0x119A0, 0x119A7},
        {0x119AA, 0x119D0},
        {0x119E1, 0x119E1},
        {0x119E3, 0x119E3},
        {0x11A00, 0x11A00},
        {0x11A0B, 0x11A32},
        {0x11A3A, 0x11A3A},
        {0x11A50, 0x11A50},
        {0x11A5C, 0x11A89},
        {0x11A9D, 0x11A9D},
        {0x11AB0, 0x11AF8},
        {0x11BC0, 0x11BE0},
        {0x11C00, 0x11C08},
        {0x11C0A, 0x11C2E},
        {0x11C40, 0x11C40},
        {0x11C72, 0x11C8F},
        {0x11D00, 0x11D06},
        {0x11D08, 0x11D09},
        {0x11D0B, 0x11D30},
        {0x11D46, 0x11D46},
        {0x11D60, 0x11D65},
        {0x11D67, 0x11D68},
        {0x11D6A, 0x11D89},
        {0x11D98, 0x11D98},
        {0x11DB0, 0x11DDB},
        {0x11EE0, 0x11EF2},
        {0x11F02, 0x11F02},
        {0x11F04, 0x11F10},
        {0x11F12, 0x11F33},
        {0x11FB0, 0x11FB0},
        {0x12000, 0x12399},
        {0x12480, 0x12543},
        {0x12F90, 0x12FF0},
        {0x13000, 0x1342F},
        {0x13441, 0x13446},
        {0x13460, 0x143FA},
        {0x14400, 0x14646},
        {0x16100, 0x1611D},
        {0x16800, 0x16A38},
        {0x16A40, 0x16A5E},
        {0x16A70, 0x16ABE},
        {0x16AD0, 0x16AED},
        {0x16B00, 0x16B2F},
        {0x16B40, 0x16B43},
        {0x16B63, 0x16B77},
        {0x16B7D, 0x16B8F},
        {0x16D40, 0x16D6C},
        {0x16E40, 0x16E7F},
        {0x16EA0, 0x16EB8},
        {0x16EBB, 0x16ED3},
        {0x16F00, 0x16F4A},
        {0x16F50, 0x16F50},
        {0x16F93, 0x16F9F},
        {0x16FE0, 0x16FE1},
        {0x16FE3, 0x16FE3},
        {0x16FF2, 0x16FF3},
        {0x17000, 0x18CD5},
        {0x18CFF, 0x18D1E},
        {0x18D80, 0x18DF2},
        {0x1AFF0, 0x1AFF3},
        {0x1AFF5, 0x1AFFB},
        {0x1AFFD, 0x1AFFE},
        {0x1B000, 0x1B122},
        {0x1B132, 0x1B132},
        {0x1B150, 0x1B152},
        {0x1B155, 0x1B155},
        {0x1B164, 0x1B167},
        {0x1B170, 0x1B2FB},
        {0x1BC00, 0x1BC6A},
        {0x1BC70, 0x1BC7C},
        {0x1BC80, 0x1BC88},
        {0x1BC90, 0x1BC99},
        {0x1D400, 0x1D454},
        {0x1D456, 0x1D49C},
        {0x1D49E, 0x1D49F},
        {0x1D4A2, 0x1D4A2},
        {0x1D4A5, 0x1D4A6},
        {0x1D4A9, 0x1D4AC},
        {0x1D4AE, 0x1D4B9},
        {0x1D4BB, 0x1D4BB},
        {0x1D4BD, 0x1D4C3},
        {0x1D4C5, 0x1D505},
        {0x1D507, 0x1D50A},
        {0x1D50D, 0x1D514},
        {0x1D516, 0x1D51C},
        {0x1D51E, 0x1D539},
        {0x1D53B, 0x1D53E},
        {0x1D540, 0x1D544},
        {0x1D546, 0x1D546},
        {0x1D54A, 0x1D550},
        {0x1D552, 0x1D6A5},
        {0x1D6A8, 0x1D6C0},
        {0x1D6C2, 0x1D6DA},
        {0x1D6DC, 0x1D6FA},
        {0x1D6FC, 0x1D714},
        {0x1D716, 0x1D734},
        {0x1D736, 0x1D74E},
        {0x1D750, 0x1D76E},
        {0x1D770, 0x1D788},
        {0x1D78A, 0x1D7A8},
        {0x1D7AA, 0x1D7C2},
        {0x1D7C4, 0x1D7CB},
        {0x1DF00, 0x1DF1E},
        {0x1DF25, 0x1DF2A},
        {0x1E030, 0x1E06D},
        {0x1E100, 0x1E12C},
        {0x1E137, 0x1E13D},
        {0x1E14E, 0x1E14E},
        {0x1E290, 0x1E2AD},
        {0x1E2C0, 0x1E2EB},
        {0x1E4D0, 0x1E4EB},
        {0x1E5D0, 0x1E5ED},
        {0x1E5F0, 0x1E5F0},
        {0x1E6C0, 0x1E6DE},
        {0x1E6E0, 0x1E6E2},
        {0x1E6E4, 0x1E6E5},
        {0x1E6E7, 0x1E6ED},
        {0x1E6F0, 0x1E6F4},
        {0x1E6FE, 0x1E6FF},
        {0x1E7E0, 0x1E7E6},
        {0x1E7E8, 0x1E7EB},
        {0x1E7ED, 0x1E7EE},
        {0x1E7F0, 0x1E7FE},
        {0x1E800, 0x1E8C4},
        {0x1E900, 0x1E943},
        {0x1E94B, 0x1E94B},
        {0x1EE00, 0x1EE03},
        {0x1EE05, 0x1EE1F},
        {0x1EE21, 0x1EE22},
        {0x1EE24, 0x1EE24},
        {0x1EE27, 0x1EE27},
        {0x1EE29, 0x1EE32},
        {0x1EE34, 0x1EE37},
        {0x1EE39, 0x1EE39},
        {0x1EE3B, 0x1EE3B},
        {0x1EE42, 0x1EE42},
        {0x1EE47, 0x1EE47},
        {0x1EE49, 0x1EE49},
        {0x1EE4B, 0x1EE4B},
        {0x1EE4D, 0x1EE4F},
        {0x1EE51, 0x1EE52},
        {0x1EE54, 0x1EE54},
        {0x1EE57, 0x1EE57},
        {0x1EE59, 0x1EE59},
        {0x1EE5B, 0x1EE5B},
        {0x1EE5D, 0x1EE5D},
        {0x1EE5F, 0x1EE5F},
        {0x1EE61, 0x1EE62},
        {0x1EE64, 0x1EE64},
        {0x1EE67, 0x1EE6A},
        {0x1EE6C, 0x1EE72},
        {0x1EE74, 0x1EE77},
        {0x1EE79, 0x1EE7C},
        {0x1EE7E, 0x1EE7E},
        {0x1EE80, 0x1EE89},
        {0x1EE8B, 0x1EE9B},
        {0x1EEA1, 0x1EEA3},
        {0x1EEA5, 0x1EEA9},
        {0x1EEAB, 0x1EEBB},
        {0x20000, 0x2A6DF},
        {0x2A700, 0x2B81D},
        {0x2B820, 0x2CEAD},
        {0x2CEB0, 0x2EBE0},
        {0x2EBF0, 0x2EE5D},
        {0x2F800, 0x2FA1D},
        {0x30000, 0x3134A},
        {0x31350, 0x33479},
    };

    // Locate the first range that starts strictly after `ch`; the only range
    // that can contain `ch` is the one immediately before it.
    const auto* after = std::upper_bound(
        std::begin(ranges), std::end(ranges), ch,
        [](char32_t value, const auto& range) { return value < range.start; });
    if (after == std::begin(ranges)) {
        return false;  // `ch` is below the first range
    }
    return ch <= (after - 1)->end;
}

// Returns true when `cp` is a whitespace code point, i.e. one of the code
// points carrying the Unicode White_Space property. This is the set that
// `\s` matches in Unicode-aware regex engines, which is what token_split's
// pattern relies on.
bool is_space(char32_t cp) {
    switch (cp) {
        case 0x0009:  // TAB \t
        case 0x000A:  // LF \n
        case 0x000B:  // VT
        case 0x000C:  // FF
        case 0x000D:  // CR \r
        case 0x0020:  // Space
        case 0x0085:  // Next Line (NEL)
        case 0x00A0:  // No-Break Space
        case 0x1680:  // Ogham Space Mark
        case 0x2000:  // En Quad
        case 0x2001:  // Em Quad
        case 0x2002:  // En Space
        case 0x2003:  // Em Space
        case 0x2004:  // Three-Per-Em Space
        case 0x2005:  // Four-Per-Em Space
        case 0x2006:  // Six-Per-Em Space
        case 0x2007:  // Figure Space
        case 0x2008:  // Punctuation Space
        case 0x2009:  // Thin Space
        case 0x200A:  // Hair Space
        case 0x2028:  // Line Separator
        case 0x2029:  // Paragraph Separator
        case 0x202F:  // Narrow No-Break Space
        case 0x205F:  // Medium Mathematical Space
        case 0x3000:  // Ideographic Space
            return true;
        default:
            return false;
    }
}

// Returns a copy of `input` in which every byte has been passed through
// std::tolower. The conversion is byte-wise, so it is not aware of
// multi-byte UTF-8 sequences.
std::string str_to_lower(const std::string& input) {
    std::string lowered(input);
    for (char& byte : lowered) {
        // Go through unsigned char: std::tolower is not defined for negative
        // char values.
        const unsigned char raw = static_cast<unsigned char>(byte);
        byte = static_cast<char>(std::tolower(raw));
    }
    return lowered;
}

// UTF-8 -> Unicode code points
//
// Decodes `str` as UTF-8 and returns the code points in order.
//
// Malformed input is handled by dropping the offending byte and resuming at
// the next one. This applies to:
//   * bytes that cannot start a sequence (stray continuation bytes, 0xF8+),
//   * lead bytes whose continuation bytes are missing (truncated input) or
//     do not have the 10xxxxxx form.
// Only the lead byte is dropped in the second case, so valid characters that
// follow a broken sequence are still decoded.
//
// Overlong encodings, surrogate code points and values above U+10FFFF are
// not rejected.
std::vector<char32_t> utf8_to_codepoints(const std::string& str) {
    std::vector<char32_t> codepoints;
    codepoints.reserve(str.size());  // upper bound: one code point per byte

    size_t i = 0;
    while (i < str.size()) {
        const unsigned char lead = static_cast<unsigned char>(str[i]);
        char32_t cp              = 0;
        size_t extra_bytes       = 0;

        if ((lead & 0x80) == 0) {
            cp = lead;  // 0xxxxxxx: ASCII
        } else if ((lead & 0xE0) == 0xC0) {
            cp          = lead & 0x1F;  // 110xxxxx
            extra_bytes = 1;
        } else if ((lead & 0xF0) == 0xE0) {
            cp          = lead & 0x0F;  // 1110xxxx
            extra_bytes = 2;
        } else if ((lead & 0xF8) == 0xF0) {
            cp          = lead & 0x07;  // 11110xxx
            extra_bytes = 3;
        } else {
            ++i;  // not a valid lead byte
            continue;
        }

        // All continuation bytes must exist (i + extra_bytes < size) ...
        bool well_formed = extra_bytes < str.size() - i;
        // ... and each must look like 10xxxxxx.
        for (size_t j = 1; well_formed && j <= extra_bytes; ++j) {
            const unsigned char cont = static_cast<unsigned char>(str[i + j]);
            if ((cont & 0xC0) != 0x80) {
                well_formed = false;
            } else {
                cp = (cp << 6) | (cont & 0x3F);
            }
        }

        if (!well_formed) {
            ++i;  // drop only the lead byte and resynchronise
            continue;
        }

        codepoints.push_back(cp);
        i += 1 + extra_bytes;
    }
    return codepoints;
}

// Unicode code point -> UTF-8
//
// Encodes `cp` as a 1- to 4-byte UTF-8 sequence, chosen by magnitude:
// up to 0x7F one byte, up to 0x7FF two, up to 0xFFFF three, otherwise four.
// The value is not validated (surrogates and values above U+10FFFF are
// encoded as-is).
std::string codepoint_to_utf8(char32_t cp) {
    // ASCII is emitted unchanged.
    if (cp <= 0x7F) {
        return std::string(1, static_cast<char>(cp));
    }

    // Choose the lead-byte marker and how many continuation bytes follow.
    unsigned lead_marker = 0xF0;
    int trailing         = 3;
    if (cp <= 0x7FF) {
        lead_marker = 0xC0;
        trailing    = 1;
    } else if (cp <= 0xFFFF) {
        lead_marker = 0xE0;
        trailing    = 2;
    }

    std::string encoded;
    // Lead byte carries the most significant payload bits.
    encoded.push_back(static_cast<char>(lead_marker | (cp >> (6 * trailing))));
    // Each continuation byte carries the next six bits, high to low.
    for (int shift = 6 * (trailing - 1); shift >= 0; shift -= 6) {
        encoded.push_back(static_cast<char>(0x80 | ((cp >> shift) & 0x3F)));
    }
    return encoded;
}

// Reports whether `prefix` occurs in `text` beginning at position `index`.
// Returns false when `index` lies past the end of `text` or when fewer than
// prefix.size() elements remain from `index` onwards. An empty prefix
// matches at any index up to and including text.size().
bool starts_with(const std::vector<char32_t>& text,
                 const std::vector<char32_t>& prefix,
                 std::size_t index) {
    const std::size_t text_len   = text.size();
    const std::size_t prefix_len = prefix.size();

    // Subtraction is only evaluated once index <= text_len is known, so it
    // cannot wrap around.
    const bool fits = index <= text_len && prefix_len <= text_len - index;
    if (!fits) {
        return false;
    }

    for (std::size_t offset = 0; offset < prefix_len; ++offset) {
        if (text[index + offset] != prefix[offset]) {
            return false;
        }
    }
    return true;
}

// mistral: [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+
// qwen2: (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
std::vector<std::string> token_split(const std::string& text) {
    std::vector<std::string> tokens;
    auto cps = utf8_to_codepoints(text);
    size_t i = 0;

    while (i < cps.size()) {
        char32_t cp = cps[i];

        // `(?i:'s|'t|'re|'ve|'m|'ll|'d)`
        if (cp == U'\'' && i + 1 < cps.size()) {
            std::string next = str_to_lower(codepoint_to_utf8(cps[i + 1]));
            if (next == "s" || next == "t" || next == "m") {
                tokens.push_back("'" + next);
                i += 2;
                continue;
            }
            if (i + 2 < cps.size()) {
                next += str_to_lower(codepoint_to_utf8(cps[i + 2]));
                if (next == "re" || next == "ve" || next == "ll" || next == "d") {
                    tokens.push_back("'" + next);
                    i += 3;
                    continue;
                }
            }
        }

        // `\p{N}`
        if (is_number(cp)) {
            tokens.push_back(codepoint_to_utf8(cp));
            ++i;
            continue;
        }

        // `[^\r\n\p{L}\p{N}]?\p{L}+`
        {
            // `[^\r\n\p{L}\p{N}]\p{L}+`
            if (!is_letter(cp) && cp != U'\r' && cp != U'\n' && i + 1 < cps.size() && is_letter(cps[i + 1])) {
                std::string token = codepoint_to_utf8(cp);
                ++i;

                while (i < cps.size() && is_letter(cps[i])) {
                    token += codepoint_to_utf8(cps[i]);
                    ++i;
                }
                tokens.push_back(token);
                continue;
            }

            // `\p{L}+`
            if (is_letter(cp)) {
                std::string token = codepoint_to_utf8(cp);
                ++i;
                while (i < cps.size() && is_letter(cps[i])) {
                    token += codepoint_to_utf8(cps[i]);
                    ++i;
                }
                tokens.push_back(token);
                continue;
            }
        }

        // ` ?[^\s\p{L}\p{N}]+[\r\n]*`
        {
            // ` [^\s\p{L}\p{N}]+[\r\n]*`
            if (cp == U' ' && i + 1 < cps.size() && !isspace(cps[i + 1]) && !is_letter(cps[i + 1]) && !is_number(cps[i + 1])) {
                std::string token = codepoint_to_utf8(cp);
                token += codepoint_to_utf8(cps[i + 1]);
                i += 2;

                while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
                    token += codepoint_to_utf8(cps[i]);
                    ++i;
                }

                while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) {
                    token += codepoint_to_utf8(cps[i]);
                    ++i;
                }

                tokens.push_back(token);
                continue;
            }

            // `[^\s\p{L}\p{N}]+[\r\n]*`
            std::string token;
            if (!is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
                std::string token = codepoint_to_utf8(cp);
                ++i;

                while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) {
                    token += codepoint_to_utf8(cps[i]);
                    ++i;
                }

                while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) {
                    token += codepoint_to_utf8(cps[i]);
                    ++i;
                }

                tokens.push_back(token);
                continue;
            }
        }

        // `\s*[\r\n]+|\s+(?!\S)|\s+`
        if (is_space(cp)) {
            std::string token = codepoint_to_utf8(cp);
            ++i;

            while (i < cps.size() && is_space(cps[i])) {
                token += codepoint_to_utf8(cps[i]);
                ++i;
                if (cps[i] == U'\r' || cps[i] == U'\n') {
                    break;
                }
            }

            tokens.push_back(token);
            continue;
        }

        // skip
        ++i;
    }

    return tokens;
}

// Cuts `text` into alternating plain-text segments and special tokens.
//
// The text is scanned left to right. At each step the special token that
// occurs earliest at or after the current position is selected; when several
// tokens start at that same position the longest one wins, so a token that
// is a prefix of another does not shadow it. The plain text before the match
// (if any) and then the token itself are appended to the result. Text after
// the last match is appended as a final segment.
//
// Empty strings in `special_tokens` are ignored. Empty plain-text segments
// are never emitted.
std::vector<std::string> split_with_special_tokens(
    const std::string& text,
    const std::vector<std::string>& special_tokens) {
    std::vector<std::string> result;
    size_t pos            = 0;
    const size_t text_len = text.size();

    while (pos < text_len) {
        size_t next_pos            = text_len;
        const std::string* matched = nullptr;  // points into special_tokens

        for (const auto& token : special_tokens) {
            if (token.empty()) {
                continue;  // would match everywhere with zero length
            }
            const size_t token_pos = text.find(token, pos);
            if (token_pos == std::string::npos) {
                continue;
            }
            const bool earlier        = token_pos < next_pos;
            const bool longer_at_same = matched != nullptr && token_pos == next_pos &&
                                        token.size() > matched->size();
            if (earlier || longer_at_same) {
                next_pos = token_pos;
                matched  = &token;
            }
        }

        if (next_pos > pos) {
            result.push_back(text.substr(pos, next_pos - pos));
        }

        if (matched == nullptr) {
            break;  // no further special tokens; the tail was pushed above
        }

        result.push_back(*matched);
        pos = next_pos + matched->size();
    }

    return result;
}

// int main() {
//     std::string text = "I'm testing C++ token_split function. 你好，世界! 123";
//     auto tokens = token_split(text);

//     for (const auto& t : tokens) {
//         std::cout << "[" << t << "] ";
//     }
//     std::cout << "\n";
//     return 0;
// }
